Instruction¶

I have a typical project of predicting the NYC uber/lyft trip demand. The dataset is available from Jan2022 to March 2023. The area is already divided into different locations. and I want the predicted demand for each location every 15 mins

Problem statment¶

The goal of this project is to predict the demand for Uber/Lyft trips in different locations of NYC every 15 minutes, using a dataset spanning from January 2022 to March 2023. The dataset includes information such as the dispatching base number, pickup datetime, drop-off datetime, pickup location ID, drop-off location ID, SR_Flag, and affiliated base number

In [1]:
import pandas as pd
import glob
import tqdm
import pandas as pd
import plotly.graph_objects as go
from statsmodels.tsa.arima.model import ARIMA
from dateutil.relativedelta import relativedelta
import numpy as np
from pmdarima import auto_arima
In [2]:
data_list_path = glob.glob('Datasets/fhv_tripdata_2022-2023_in_csv/*.csv')

list_df = []
for path in data_list_path:
    print(path)
    # Step 1: Preprocess the Dataset
    df = pd.read_csv(path)
    list_df.append(df)
    
df =  pd.concat(list_df)

interested_features = ['pickup_datetime','PUlocationID']
df = df[interested_features]
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-09.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-04.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-07.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-06.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-08.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-11.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-12.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-02.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-03.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2023-01.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-05.csv
Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-10.csv
In [3]:
import pandas as pd
import pmdarima as pm
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])
Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902
In [4]:
import pandas as pd
import plotly.graph_objects as go
from sklearn.model_selection import train_test_split
from prophet import Prophet

print('Number of Rows Before Removing NaN:', df.shape[0])
removed_nan_df = df.dropna()
print('Number of Rows After Removing NaN:', removed_nan_df.shape[0])

location_ids = removed_nan_df['PUlocationID'].unique().tolist()

loop_count = 0
for lc_id in location_ids:
    print('Location ID:', lc_id)
    df_subset = removed_nan_df[removed_nan_df['PUlocationID'] == lc_id]
    df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
    df_subset = df_subset.sort_values('pickup_datetime')
    df_subset = df_subset.set_index('pickup_datetime')
    df_subset = df_subset['PUlocationID'].resample('1H').count()
    df_subset = df_subset.reset_index()

    # Split data into training and testing sets
    train_size = int(len(df_subset) * 0.95)
    train_data = df_subset[:train_size]
    test_data = df_subset[train_size:]

    # Prepare data for Prophet model
    prophet_train_data = train_data.rename(columns={'pickup_datetime': 'ds', 'PUlocationID': 'y'})

    # Create and fit the Prophet model
    model = Prophet(
        seasonality_mode='additive',
        daily_seasonality=True,  # Disable daily seasonality
        weekly_seasonality=True,  # Enable weekly seasonality
        yearly_seasonality=False,  # Disable yearly seasonality
    )
    model.fit(prophet_train_data)

    # Generate future dates for prediction
    future_dates = model.make_future_dataframe(periods=len(test_data), freq='H')

    # Make predictions
    forecast = model.predict(future_dates)
    forecast = forecast[['ds', 'yhat']][-len(test_data):]

    # Plotting
    fig = go.Figure()
    fig.add_trace(go.Scatter(x=prophet_train_data['ds'], y=prophet_train_data['y'], mode='lines+markers', name='Training Data'))
    fig.add_trace(go.Scatter(x=test_data['pickup_datetime'], y=test_data['PUlocationID'], mode='lines+markers', name='Testing Data'))
    fig.add_trace(go.Scatter(x=forecast['ds'], y=forecast['yhat'], mode='lines+markers', name='Prophet Forecast'))
    fig.update_layout(title=f'PickLocation ID: {lc_id} - Facebook Prophet', xaxis_title='Time', yaxis_title='Number Drives')
    fig.show()

    loop_count += 1
    if loop_count > 5:
        break
/home/iffi/anaconda3/envs/sep_darts_2/lib/python3.11/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html
  from .autonotebook import tqdm as notebook_tqdm
Number of Rows Before Removing NaN: 17712727
Number of Rows After Removing NaN: 4164902
Location ID: 12.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning: 
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df_subset['pickup_datetime'] = pd.to_datetime(df_subset['pickup_datetime'])
12:04:51 - cmdstanpy - INFO - Chain [1] start processing
12:04:53 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 89.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:04:54 - cmdstanpy - INFO - Chain [1] start processing
12:04:55 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 87.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:04:57 - cmdstanpy - INFO - Chain [1] start processing
12:04:58 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 230.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:05:00 - cmdstanpy - INFO - Chain [1] start processing
12:05:01 - cmdstanpy - INFO - Chain [1] done processing
Location ID: 73.0
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:05:03 - cmdstanpy - INFO - Chain [1] start processing
12:05:04 - cmdstanpy - INFO - Chain [1] done processing
/tmp/ipykernel_18793/3238944328.py:16: SettingWithCopyWarning:


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy

12:05:05 - cmdstanpy - INFO - Chain [1] start processing
Location ID: 93.0
12:05:07 - cmdstanpy - INFO - Chain [1] done processing
In [5]:
# df_subset.values
In [6]:
# df_subset
In [7]:
# df = df_subset

# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df = df.set_index('pickup_datetime')

# df['pickups_per_hour'] = df['PUlocationID'].resample('3H').count()
# df
In [8]:
df
Out[8]:
pickup_datetime PUlocationID
0 2022-09-01 00:34:00 NaN
1 2022-09-01 00:10:00 NaN
2 2022-09-01 00:58:35 NaN
3 2022-09-01 00:50:00 NaN
4 2022-09-01 00:45:00 NaN
... ... ...
1174983 2022-10-31 23:30:36 NaN
1174984 2022-10-31 23:15:13 NaN
1174985 2022-10-31 23:41:39 NaN
1174986 2022-10-31 23:15:23 NaN
1174987 2022-10-31 23:33:06 NaN

17712727 rows × 2 columns

In [9]:
# import pandas as pd
# import matplotlib.pyplot as plt

# # Step 1: Preprocess the Dataset
# df = pd.read_csv('Datasets/fhv_tripdata_2022-2023_in_csv/fhv_tripdata_2022-01.csv')
# df['pickup_datetime'] = pd.to_datetime(df['pickup_datetime'])
# df['dropOff_datetime'] = pd.to_datetime(df['dropOff_datetime'])
# df.set_index('pickup_datetime', inplace=True)

# # Step 2: Resample the Dataset
# demand_15_mints = df[['PUlocationID', 'DOlocationID']].resample('15T').size()
# demand_30_mints = df[['PUlocationID', 'DOlocationID']].resample('30T').size()
# demand_60_mints = df[['PUlocationID', 'DOlocationID']].resample('1h').size()

# # Step 3: Predict the Demand (using your preferred model)

# # Step 4: Visualize the Demand
# demand_15_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
In [10]:
# # Step 4: Visualize the Demand
# demand_30_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
In [11]:
# # Step 4: Visualize the Demand
# demand_60_mints.plot(figsize=(12, 6))
# plt.xlabel('Time')
# plt.ylabel('Demand')
# plt.title('NYC Uber/Lyft Trip Demand')
# plt.show()
In [12]:
# demand_30_mints